#############################################
# differential abundance analysis by distance
#############################################

library(ggordiplots)
library(microViz)
library("vegan")
library(phyloseqCompanion)
library(ggplot2)
library(Rmisc)
library(ggsignif)
library(ggpubr)
library(dplyr)
library("gridExtra")
library(microbiomeMarker)
library(mblm)
library(tidyr)
require(MASS)
require(boot)
require(ggplot2)
require(pscl)

setwd("~/Research/Mycorrhizae/Fieldwork/Genetics/NGS sequencing/Code")

# load physeq object
load("~/Research/Mycorrhizae/Fieldwork/Genetics/NGS sequencing/Code/phyloseq final variables.RData")

# set scientific notation off
options(scipen=999)

soil.data <- read.csv("soil.data.final.csv")

# add covariates to phyloseq object
sample_data(physeq)$organic.soil.C.N <- soil.data$organic.soil.C.N[1:96]
sample_data(physeq)$organic.soil.GWC <- soil.data$organic.soil.GWC[1:96]
sample_data(physeq)$carcass.deposition <- soil.data$carcass.deposition[1:96]
sample_data(physeq)$ph <- soil.data$ph[1:96]
sample_data(physeq)$decomposing.carcass <- soil.data$decomposing.carcass[1:96]
sample_data(physeq)$white.spruce <- soil.data$white.spruce[1:96]
sample_data(physeq)$paper.birch <- soil.data$paper.birch[1:96]
sample_data(physeq)$dwarf.birch <- soil.data$dwarf.birch[1:96]
sample_data(physeq)$moss <- soil.data$moss[1:96]
sample_data(physeq)$vaccinium <- soil.data$vaccinium[1:96]
sample_data(physeq)$fern <- soil.data$fern[1:96]
sample_data(physeq)$kinnikinnick <- soil.data$kinnikinnick[1:96]
sample_data(physeq)$heather <- soil.data$heather[1:96]
sample_data(physeq)$eriophorum <- soil.data$eriophorum[1:96]
sample_data(physeq)$fireweed <- soil.data$fireweed[1:96]
sample_data(physeq)$horsetail <- soil.data$horsetail[1:96]
sample_data(physeq)$alder <- soil.data$alder[1:96]
sample_data(physeq)$hogsweed <- soil.data$hogsweed[1:96]
sample_data(physeq)$willow <- soil.data$willow[1:96]
sample_data(physeq)$redberry <- soil.data$redberry[1:96]
sample_data(physeq)$azalea <- soil.data$azalea[1:96]
sample_data(physeq)$slope <- soil.data$slope[1:96]

# use relative abundance
physeq.prop <- transform_sample_counts(physeq, function(x) x/sum(x))

#################################################
# LM model to compare the relative abundance of different genera by C:N at all streams
###################################################

# filter by need
physeq_nobog <- subset_samples(physeq.prop, Transect != "Bog")
physeq_hansen <- subset_samples(physeq.prop, Stream == "Hansen")
physeq_happy <- subset_samples(physeq.prop, Stream == "Happy")
physeq_yako <- subset_samples(physeq.prop, Stream == "Yako")
physeq_hansen_right <- subset_samples(physeq_hansen, Side=="SR")
physeq_hansen_left <- subset_samples(physeq_hansen, Side=="SL")

# agglomerate by genus
genus.glom <- tax_glom(physeq.prop, taxrank = 'Genus')

# get genus abundance matrix from phyloseq object
dat_genus <- psmelt(genus.glom)
dat_genus$Genus <- as.character(dat_genus$Genus)
dat_genus$Genus <- dat_genus$Genus %>% replace_na('Unclassified')
genus.abund <- subset(dat_genus, select = c("Sample", "Abundance", "Stream", "Transect", "Side", "Distance", "organic.soil.C.N", "organic.soil.GWC",
                                            "carcass.deposition", "ph", "decomposing.carcass", "Genus", "white.spruce", "paper.birch", 
    "dwarf.birch", "moss", "vaccinium", "fern", "kinnikinnick", "heather", "eriophorum", "fireweed", "horsetail", "alder", "hogsweed", 
    "willow", "redberry", "azalea","slope"))

# create unique number for each genera
genus.abund <- transform(genus.abund,id=as.numeric(factor(Genus)))

# check how many genera there are based on which dataset you are using
length(unique(genus.abund$Genus))

# create vector for each genera and p value
pvalue <- c(1:308)
size <- c(1:308)
abund <- c(1:308)

for (i in 1:308){
  genus.final <- genus.abund[which(genus.abund$id==i),]
  #mod = mblm(Abundance ~ organic.soil.C.N,data=genus.final)
  mod = glm(Abundance ~ organic.soil.C.N, data=genus.final, family="poisson")
  #mod = zeroinfl(Abundance ~ organic.soil.C.N, data=genus.final)
  temp <- summary(mod)
  pvalue[i] <- temp$coefficients[2,4]
  size[i] <- temp$coefficients[2,1]
}

# create dataframe with genus and p value combined
genus.p <- data.frame(levels(factor(genus.abund$Genus)), pvalue, size)
genus.p <- genus.p[order(pvalue),]
genus.sig <- genus.p[which(genus.p$pvalue < 0.051),]
colnames(genus.sig) <- c("Genus","pvalue","size")
View(genus.sig)

#save
write.csv(genus.sig, file='~/Research/Mycorrhizae/Fieldwork/Genetics/NGS sequencing/Code/Abundance metrics/RA.C.N.allstreams.csv')

# export means by stream for each genus to make bar graph in excel
final <- genus.abund[genus.abund$Genus %in% genus.sig$Genus,]
tgc <- summarySE(final, measurevar="Abundance", groupvars=c("Genus"))
tgc_order <- tgc[order(-tgc$Abundance),]
tgc_order

# visualize this
# put distance 5 into distance 6 bin
for (i in 1:length(genus.abund$Distance)){
  if (genus.abund$Distance[i]==5){
    genus.abund$Distance[i] <- 6}
}

# examine phyla individually with different models
# non-parametric regression is probably better, try the Kendall-Theil regression
genus.final <- genus.abund[which(genus.abund$Genus=="Paxillus"),]
genus.final <- na.omit(genus.final)
mod = mblm(Abundance ~ Distance,data=genus.final)

all.parms <- lm(Abundance ~ organic.soil.C.N + organic.soil.GWC + white.spruce + paper.birch + dwarf.birch + moss + vaccinium + fern + kinnikinnick + 
                  heather + eriophorum + fireweed + horsetail + alder + hogsweed + willow + redberry + azalea + slope + Distance +
                  carcass.deposition + decomposing.carcass, data=genus.final)
step.model <- stepAIC(all.parms, direction = "both", trace = FALSE)
summary(step.model)

# examine data by stream
ggplot(genus.final, aes(x=Abundance, y=Stream)) + geom_boxplot(alpha=0.2) + stat_summary(fun=mean, geom="point")

# russula increased with C:N,, horsetail, and azalea
# mortierella decreased with GWC, moss, azalea, paper birch, and increased with distance
# Laccaria decreased with slope, horstail, and redberry
# inocybe decreased with white spruce
# tylospora decreased with C:N and hogsweed, increased with GWC, slope, and moss
# cortinarius increased with C:N and slope
# solicoccozyma decreased with decomposing carcass
# lactarius decreased with GWC and azalea and other stuff
# tomentella increased with carcass deposition
# naucoria increased with GWC and carcass deposition

# paxillus increased with decomposing carcass, and decreased with white spruce, paper birch, moss, and hogsweed (without plants, only
# increased with decomposing carcass)
# boletus decreased with C:N, and increased with carcass deposition and GWC
# alpova decreased with white spruce and paper birch, and fern (without plants, decreased with C:N)
# xerocomus increased with decomposing carcass

# create estimates based on model
pred_interval <- c(1,3,6,10,20,40,60,100)
example <- data.frame(Distance=pred_interval)
pred1 <- predict(mod, newdata=example, re.form=NA, type="response", interval='confidence')
pred1 <- as.data.frame(pred1)
pred1$Distance <- c(1,3,6,10,20,40,60,100)

# summarize data by distance
tgc <- summarySE(genus.final, measurevar="Abundance", groupvars=c("Distance"))

# plot 
ggplot(tgc, aes(x=Distance, y=Abundance)) + geom_bar(position = "dodge", 
     stat = "summary", fun = "mean") + theme_classic() + ggtitle("Solicoccozyma")+
  theme( axis.text.x = element_text(face = "bold",colour = "black", size = 12), 
         axis.text.y = element_text(face = "bold", size = 11, colour = "black"), 
         axis.title= element_text(face = "bold", size = 14, colour = "black"), 
         panel.background = element_blank(), legend.position="none",
         panel.border = element_rect(fill = NA, colour = "black")) + 
  ylab("Relative Abundance") + 
  geom_errorbar(aes(ymin=Abundance-se, ymax=Abundance+se), 
                width=.2,position=position_dodge(.9)) +
  geom_line(data=pred1, aes(x=Distance, y=fit)) + 
  geom_line(data=pred1, aes(x=Distance, y=fit+0.05*upr), linetype="dotted")+
  geom_line(data=pred1, aes(x=Distance, y=fit-0.05*upr), linetype="dotted")

#################################################
# run GLM with proportion/count data
#################################################

# combine phyloseq object with alpha diversity metrics 
data <- cbind(sample_data(physeq_hansen), data = genus.abund, family = poisson)

mod1 <- betareg(Abundance ~ organic.soil.C.N + organic.soil.GWC + carcass.deposition + decomposing.carcass, data = genus.abund)
summary(mod1)

#################################################
# LM model to compare the abundance of different families by C:N at all streams
###################################################

# filter by need
physeq_nobog <- subset_samples(physeq.prop, Transect != "Bog")
physeq_hansen <- subset_samples(physeq.prop, Stream == "Hansen")
physeq_happy <- subset_samples(physeq.prop, Stream == "Happy")
physeq_yako <- subset_samples(physeq.prop, Stream == "Yako")
physeq_hansen_right <- subset_samples(physeq_hansen, Side=="SR")
physeq_hansen_left <- subset_samples(physeq_hansen, Side=="SL")

# agglomerate by genus
family.glom <- tax_glom(physeq_hansen_left, taxrank = 'Family')

# get genus abundance matrix from phyloseq object
dat_family <- psmelt(family.glom)
dat_family$Family <- as.character(dat_family$Family)
dat_family$Family <- dat_family$Family %>% replace_na('Unclassified')
family.abund <- dat_family[,c(3,5,6,7,8,9,14)]

# create unique number for each genera
family.abund <- transform(family.abund,id=as.numeric(factor(Family)))

# check how many genera there are based on which dataset you are using
length(unique(family.abund$id))

# create vector for each genera and p value
pvalue <- c(1:154)
size <- c(1:154)
abund <- c(1:154)

for (i in 1:154){
  family.final <- family.abund[which(family.abund$id==i),]
  mod = mblm(Abundance ~ organic.soil.C.N,data=family.final)
  temp <- summary(mod)
  pvalue[i] <- temp$coefficients[2,4]
  size[i] <- temp$coefficients[2,1]
}

# create dataframe with family and p value combined
family.p <- data.frame(levels(factor(family.abund$Family)), pvalue, size)
family.p <- family.p[order(pvalue),]
family.sig <- family.p[which(family.p$pvalue < 0.051),]
colnames(family.sig) <- c("family","pvalue","size")
View(family.sig)

#save
write.csv(family.sig, file='~/Research/Mycorrhizae/Fieldwork/Genetics/NGS sequencing/Code/Abundance metrics/RA.C.N.HansenSL.byfamily.csv')

# export means by stream for each genus to make bar graph in excel
final <- genus.abund[genus.abund$Genus %in% genus.sig$Genus,]
tgc <- summarySE(final, measurevar="Abundance", groupvars=c("Genus"))
tgc_order <- tgc[order(-tgc$Abundance),]
tgc_order

# visualize this
# put distance 5 into distance 6 bin
for (i in 1:length(genus.abund$Distance)){
  if (genus.abund$Distance[i]==5){
    genus.abund$Distance[i] <- 6}
}

# examine phyla individually
# non-parametric regression is probably better, try the Kendall-Theil regression
genus.final <- genus.abund[which(genus.abund$Genus=="Naucoria"),]
mod = mblm(Abundance ~ Distance,data=genus.final)
summary(mod)

# create estimates based on model
pred_interval <- c(1,3,6,10,20,40,60,100)
example <- data.frame(Distance=pred_interval)
pred1 <- predict(mod, newdata=example, re.form=NA, type="response", interval='confidence')
pred1 <- as.data.frame(pred1)
pred1$Distance <- c(1,3,6,10,20,40,60,100)

# summarize data by distance
tgc <- summarySE(genus.final, measurevar="Abundance", groupvars=c("Distance"))

# plot 
ggplot(tgc, aes(x=Distance, y=Abundance)) + geom_bar(position = "dodge", 
                                                     stat = "summary", fun = "mean") + theme_classic() + ggtitle("Solicoccozyma")+
  theme( axis.text.x = element_text(face = "bold",colour = "black", size = 12), 
         axis.text.y = element_text(face = "bold", size = 11, colour = "black"), 
         axis.title= element_text(face = "bold", size = 14, colour = "black"), 
         panel.background = element_blank(), legend.position="none",
         panel.border = element_rect(fill = NA, colour = "black")) + 
  ylab("Relative Abundance") + 
  geom_errorbar(aes(ymin=Abundance-se, ymax=Abundance+se), 
                width=.2,position=position_dodge(.9)) +
  geom_line(data=pred1, aes(x=Distance, y=fit)) + 
  geom_line(data=pred1, aes(x=Distance, y=fit+0.05*upr), linetype="dotted")+
  geom_line(data=pred1, aes(x=Distance, y=fit-0.05*upr), linetype="dotted")

#####################################################################################
# LM model to test changes in abundance of different phylum by distance at all streams
####################################################################################

# filter
physeq_nobog <- subset_samples(physeq.prop, Transect != "Bog")
physeq_hansen <- subset_samples(physeq.prop, Stream == "Hansen")
physeq_happy <- subset_samples(physeq.prop, Stream == "Happy")
physeq_yako <- subset_samples(physeq.prop, Stream == "Yako")
physeq_hansen_right <- subset_samples(physeq_hansen, Side=="SR")
physeq_hansen_left <- subset_samples(physeq_hansen, Side=="SL")

# agglomerate by phylum
phylum.glom <- tax_glom(physeq_hansen, taxrank = 'Phylum')

# get genus abundance matrix from phyloseq object
dat_phylum <- psmelt(phylum.glom)
dat_phylum$Phylum <- as.character(dat_phylum$Phylum)
dat_phylum$Phylum <- dat_phylum$Phylum %>% replace_na('Unclassified')
phylum.abund <- dat_phylum[,c(3,5,8,10)]

# create unique number for each genera
phylum.abund <- transform(phylum.abund,id=as.numeric(factor(Phylum)))

# create vector for each genera and p value
pvalue <- c(1:10)
size <- c(1:10)
abund <- c(1:10)

for (i in 1:10){
  phylum.final <- phylum.abund[which(phylum.abund$id==i),]
  mod = mblm(Abundance ~ Distance,data=phylum.final)
  temp <- summary(mod)
  pvalue[i] <- temp$coefficients[2,4]
  size[i] <- temp$coefficients[2,1]
  abund[i] <- phylum.final$Abundance
}

# create dataframe with genus and p value combined
phylum.p <- data.frame(levels(factor(phylum.abund$Phylum)), pvalue, size, abund)
phylum.p <- phylum.p[order(pvalue),]
phylum.sig <- phylum.p[which(phylum.p$pvalue < 0.051),]
colnames(phylum.sig) <- c("Phylum","pvalue","size","abundance")
View(phylum.sig)

# order by abundance before saving
phylum.sig <- phylum.sig[order(-phylum.sig$abundance),]
write.csv(phylum.sig, "sig.phylumabundance.bydistance.Hansen.left.csv")

# visualize this
# put distance 5 into distance 6 bin
for (i in 1:length(phylum.abund$Distance)){
  if (phylum.abund$Distance[i]==5){
    phylum.abund$Distance[i] <- 6}
}

# examine phyla individually
# non-parametric regression is probably better, try the Kendall-Theil regression
phylum.final <- phylum.abund[which(phylum.abund$Phylum=="Mucoromycota"),]
mod = mblm(Abundance ~ Distance,data=phylum.final)
summary(mod)

# create estimates based on model
pred_interval <- c(1,3,6,10,20,40,60,100)
example <- data.frame(Distance=pred_interval)
pred1 <- predict(mod, newdata=example, re.form=NA, type="response", interval='confidence')
pred1 <- as.data.frame(pred1)
pred1$Distance <- c(1,3,6,10,20,40,60,100)

# summarize data by distance
tgc <- summarySE(phylum.final, measurevar="Abundance", groupvars=c("Distance"))

# plot 
ggplot(tgc, aes(x=Distance, y=Abundance)) + geom_bar(position = "dodge", 
                                                     stat = "summary", fun = "mean") + theme_classic() + ggtitle("Mucoromycota")+
  theme( axis.text.x = element_text(face = "bold",colour = "black", size = 12), 
         axis.text.y = element_text(face = "bold", size = 11, colour = "black"), 
         axis.title= element_text(face = "bold", size = 14, colour = "black"), 
         panel.background = element_blank(), legend.position="none",
         panel.border = element_rect(fill = NA, colour = "black")) + 
  ylab("Relative Abundance") + 
  geom_errorbar(aes(ymin=Abundance-se, ymax=Abundance+se), 
                width=.2,position=position_dodge(.9)) +
  geom_line(data=pred1, aes(x=Distance, y=fit)) + 
  geom_line(data=pred1, aes(x=Distance, y=fit+0.05*upr), linetype="dotted")+
  geom_line(data=pred1, aes(x=Distance, y=fit-0.05*upr), linetype="dotted")


